CGI How-To

home *** CD-ROM | disk | FTP | other *** search

/ CGI How-To / CGI HOW-TO.iso / chap5 / 5_1 / parse_pl / parsehtm.pl < prev next >

Wrap

Perl Script | 1996-06-15 | 9.1 KB | 408 lines

#!/usr/local/bin/perl

use strict;
use warnings;

# This package uses the global file handle htmlFile.
# There are two global assoc. arrays, %endTags and %handlerDict.

# parseHtml takes one argument, a filename,
# and returns the parsed html in a string.
#
# Arguments:
#   $fileName - path of the HTML file to read.
#
# Returns:
#   The string produced by mainHtmlParser for the whole file, or undef
#   when the file cannot be opened (a warning naming the file and the
#   system error is emitted in that case).
#
# Dies:
#   Re-throws whatever mainHtmlParser dies with, after closing the file.
sub parseHtml
{
    # Declare variables to hold the argument and the return value.
    my ($fileName) = @_;
    my $retVal;

    # Open the file read-only.  The three-argument form keeps characters
    # such as '>' or '|' in the file name from being read as an open mode.
    # The result of open() itself is tested: the bareword "htmlFile" is
    # just a true string and says nothing about whether the open worked.
    if (!open(*htmlFile, '<', $fileName))
    {
        warn "parseHtml: cannot open $fileName: $!\n";
        return $retVal;
    }

    # Run the parser with no stop string and no stop character, so it
    # reads to the end of the file.  The eval lets us close the handle
    # even when the parser dies on malformed input.
    $retVal = eval { mainHtmlParser("", 0) };
    my $parseError = $@;

    # Close the file.
    close(*htmlFile);

    # Pass a parser failure on to the caller unchanged.
    die $parseError if $parseError;

    # Return the string parsed from the file.
    return $retVal;
}

# mainHtmlParser takes several arguments.
# This subroutine can either take a stop string, or a stop char.
# It reads the file htmlFile until either the end of file,
# the stop string or the stop char is encountered.
#
# mainHtmlParser returns a string filtered from the file.
# The filters are tag handlers and a default handler.
# Handlers should take 5 arguments for:
#
#   tagString - The string containing the tag
#   argString - Any data between the tag and end tag
#   endString - The end tag
#   tagDict   - The dictionary created using dictForTag
#   userData  - The user data argument
#
# Handlers are registered in the global dictionary
# handlerDict.
#
# If the tag has a matching end tag like <HTML> and </HTML>
# then the tag should be registered in the global
# %endTags array, with the value equal to its end tag.
#
# If the tag needs the data up to the end of the line, like
# OPTION, then it should appear in %endTags with the value
# "eol".
#
# Handlers should return the string to replace the tag with.
#
# The default is used for text that wasn't part of a tag.
# Tags are denoted by <text>.
# As plain text is encountered the handler registered under
# the string "DEFAULT" is called.
# Global registries filled in by the script that uses this package:
#   %handlerDict maps an upper-case tag name (or "DEFAULT" for plain
#                text) to a handler: a subroutine name or a code ref.
#   %endTags     maps an upper-case tag name to its end tag, or to the
#                string "eol" when the tag takes the rest of the line.
our (%endTags, %handlerDict);

# mainHtmlParser reads the global file handle htmlFile one character at
# a time and returns the filtered text.
#
# Arguments:
#   $stopStr  - if true, reading stops at the first tag matching this
#               string (it is used as a case-insensitive pattern); the
#               matching tag itself is not part of the returned text.
#   $stopChar - if true, reading stops after this character, which is
#               included in the returned text.
#
# Returns the text read so far, with tags replaced by handleTag's
# result and plain text passed through handlePlainText.
#
# Dies with "This is an invalid html file.\n" on "<" inside a tag, on
# ">" outside a tag, or when the input ends inside a tag.
sub mainHtmlParser
{
    my ($stopStr, $stopChar) = @_;

    my $fh         = \*htmlFile;   # the package-wide input handle
    my $inTag      = 0;            # true while we are inside <>'s
    my $tmpBuffer  = "";           # text of the current tag or text run
    my $mainBuffer = "";           # this is what is returned

    # Loop until the end of the file, or until we encounter the stop
    # string or stop character.
    while (1)
    {
        # Get the next character from the file.  This is not the most
        # efficient way to read a file, but it keeps the code simple.
        # getc returns undef when there is nothing left to read.
        my $char     = getc($fh);
        my $haveChar = defined $char;

        if ($haveChar && $char eq "<")      # start of a tag
        {
            # Don't allow any tags inside other tags.
            die "This is an invalid html file.\n" if $inTag;
            $inTag = 1;

            # Flush any pending plain text.  Test the length rather
            # than the truth of the buffer so that the text "0" is kept.
            if (length $tmpBuffer)
            {
                my $text = handlePlainText($tmpBuffer);
                $mainBuffer .= $text if defined $text;
            }

            # Start the new tmp buffer.
            $tmpBuffer = "<";
        }
        elsif ($haveChar && $char eq ">")   # end of a tag
        {
            # Don't allow end tags without start tags.
            die "This is an invalid html file.\n" unless $inTag;
            $inTag = 0;
            $tmpBuffer .= ">";

            # See if we have read up to the stop string.
            if ($stopStr && ($tmpBuffer =~ /$stopStr/i))
            {
                return $mainBuffer;
            }

            # Otherwise handle the tag and keep reading.  handleTag may
            # itself read further from the file (end tags, "eol" tags).
            my $replacement = handleTag($tmpBuffer);
            $mainBuffer .= $replacement if defined $replacement;
            $tmpBuffer = "";
        }
        elsif (!$haveChar || eof($fh)
               || ($stopChar && ($char eq $stopChar)))
        {
            # Don't allow the parsing to end inside a tag.
            die "This is an invalid html file.\n" if $inTag;

            # Keep the character just read.  eof() is already true once
            # the final character has been fetched, so it must not be
            # used to decide whether $char holds data.
            $tmpBuffer .= $char if $haveChar;

            # Handle the remaining text and return: we are at the end
            # of the file, or we found the stop character.
            my $text = handlePlainText($tmpBuffer);
            $mainBuffer .= $text if defined $text;
            return $mainBuffer;
        }
        else
        {
            # If nothing else, add the character to the tmp buffer.
            $tmpBuffer .= $char;
        }

        last if eof($fh);
    }

    # The input ended directly after a "<".
    die "This is an invalid html file.\n" if $inTag;

    return $mainBuffer;
}

# handleTag actually handles the tags for mainHtmlParser.
#
# Arguments:
#   $tagString - the complete tag text, from "<" to ">".
#
# Returns the handler's replacement text.  A tag without a registered
# handler is treated as plain text.
#
# Handlers are called as
#   handler($tagString, $argString, $endTag, %tagDict)
# where $argString and $endTag are 0 when they do not apply.
sub handleTag
{
    my ($tagString) = @_;

    # Create an associative array containing the data for the tag.
    my %tagDict = dictForTag($tagString);

    # Strings that are not recognised as a tag have no TAG entry; they
    # are looked up under the empty string.
    my $tagName = defined $tagDict{"TAG"} ? $tagDict{"TAG"} : "";
    my $endTag  = $endTags{$tagName};
    my $handler = $handlerDict{$tagName};

    # If no handler is found, treat the tag as plain text.
    return handlePlainText($tagString) unless $handler;

    # Tag that needs the data up to the end of the line.
    if (defined $endTag && $endTag eq "eol")
    {
        my $argString = mainHtmlParser("", "\n");
        return _callHandler($handler, $tagString, $argString, 0, %tagDict);
    }

    # Tag with an end tag: read everything up to the end tag.
    # mainHtmlParser does not include the end tag in what it returns,
    # so the text is passed on untouched; stripping a trailing "<...>"
    # here would delete nested markup.
    if ($endTag)
    {
        my $argString = mainHtmlParser($endTag, 0);
        return _callHandler($handler, $tagString, $argString, $endTag,
                            %tagDict);
    }

    # General unary tag: simply call the handler.
    return _callHandler($handler, $tagString, 0, 0, %tagDict);
}

# handlePlainText actually handles plain text for mainHtmlParser.
#
# Arguments:
#   $plainString - the text to filter.
#
# Returns the result of the handler registered under "DEFAULT", called
# as handler($plainString, 0, 0, 0), or the text unchanged when no such
# handler is registered.
sub handlePlainText
{
    my ($plainString) = @_;

    # Look for a default handler for plain text.
    my $handler = $handlerDict{"DEFAULT"};
    return $plainString unless $handler;

    return _callHandler($handler, $plainString, 0, 0, 0);
}

# _callHandler invokes one registered handler in scalar context.
#
# $handler is either a subroutine name, as stored in %handlerDict, or a
# code reference.  A handler that dies does not abort the parse: the
# error is reported with warn and undef is returned, which the callers
# treat as empty output.
sub _callHandler
{
    my ($handler, @handlerArgs) = @_;

    my $result = eval {
        no strict 'refs';   # handlers are registered by subroutine name
        &$handler(@handlerArgs);
    };
    warn "handler '$handler' failed: $@" if $@;

    return $result;
}

# Creates an associative array for a tag string.
#
# Arguments:
#   $tagString - the complete tag text, e.g. <IMG SRC="a.gif" ISMAP>.
#
# Returns a list of key/value pairs:
#   TAG            - the upper-cased tag name (also for <!--name ...>)
#   NAME => value  - for name="value", name='value' and name=value
#   NAME => NAME   - for attributes given without a value
# All keys are upper case.  The list is empty when the string does not
# start with a tag name.
sub dictForTag
{
    my ($tagString) = @_;
    my %tagDict;

    return %tagDict unless defined $tagString;

    # Look for the tag name and remove it from the tag string.  The
    # separator after the name is left in place (look-ahead) because
    # the unary-attribute pattern below needs leading white space.
    if ($tagString =~ s/^<(?:!--)?(\w+)(?=[\s>])//)
    {
        $tagDict{"TAG"} = uc $1;
    }
    else
    {
        # If no tag is found, then this is not a tag string.
        return %tagDict;
    }

    # Find all of the tag's key/value attributes and remove them from
    # the tag string.  The value may be double quoted, single quoted,
    # or a bare word.
    while ($tagString =~ s/(\w*)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>"']+))//)
    {
        my ($name, $doubleQuoted, $singleQuoted, $bare) = ($1, $2, $3, $4);
        next unless $name;

        # Store the value exactly as written, so that "0" stays "0".
        $tagDict{uc $name} = defined $doubleQuoted ? $doubleQuoted
                           : defined $singleQuoted ? $singleQuoted
                           :                         $bare;
    }

    # Find the single attributes and remove them from the string.
    # Only the leading white space is consumed, so the next attribute
    # still has its separator in front of it.
    while ($tagString =~ s/\s+(\w*)//)
    {
        my $name = $1;
        next unless $name;

        $name = uc $name;
        $tagDict{$name} = $name;
    }

    return %tagDict;
}

# Creates a string from a tag dictionary.
#
# Arguments:
#   %tagDict - key/value pairs as returned by dictForTag.
#
# Returns the tag text, or undef when the dictionary has no TAG entry.
# Attributes are written in sorted key order so that the output does
# not depend on hash ordering.  An attribute whose value equals its key
# is written without a value; attributes with an empty value are
# omitted.
sub stringForTagDict
{
    my (%tagDict) = @_;
    my $tagString;

    # Without a TAG there is nothing to build.
    return $tagString unless $tagDict{"TAG"};

    # Start the string with a < and the tag.
    $tagString = "<" . $tagDict{"TAG"};

    # Add the keys to the string.
    foreach my $key (sort keys %tagDict)
    {
        # Ignore TAG, we already added it.
        next if $key eq "TAG";

        # Test the length, not the truth, so that a value of "0" is kept.
        my $value = $tagDict{$key};
        next unless defined $value && length $value;

        if ($key eq $value)     # unary attribute
        {
            $tagString .= " " . $key;
        }
        else                    # key/value attribute
        {
            $tagString .= " " . $key . "= \"" . $value . "\"";
        }
    }

    # Close the tag string.
    $tagString .= ">";

    return $tagString;
}

1;